#%matplotlib widget
[Kuehn, Kuntz] show that feed forward neural networks have embedding restrictions. We implement simple examples.
Example: A classification of a classical inner and outer circle data set
Outcome: We will observe that certain architectures fail to generate high accuracy despite a large amount of parameters due to the embedding restrictions
Architectures:
It holds that FFNN of constant input dimension width and with full rank parameter matrices are not able to generate singular points $x^*$, i.e. a Jacobian w.r.t. the inputs that is the 0 matrix at $x^*$. As such, any NN of two dimensional input with layers of width two cannot generate an input to output function that is topologically equivalent to $|x|^2$.
It further holds that even a bottleneck, that is, a sequence of 3 layers where the middle layer has a strictly smaller dimension, cannot generate a singular point either. As a result, no FFNN with at most 2 neurons per layer can approximate $|x|^2$.
TODO: add the training data setup from the other file so this notebook runs end to end. The `test_dataloader` was already added there (it was previously unused); here it should be used to compute accuracy.
# 🧪 Imports and data prep
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from models.training import make_circles_uniform, create_dataloader
from plots.plots import plot_decision_boundary
# Experiment configuration.
# seed = 43
seed = np.random.randint(1000)  # fresh random seed per run; uncomment the line above to pin it
torch.manual_seed(seed)
np.random.seed(seed)
depth = 15 # Number of layers
cross_entropy = True  # NOTE(review): appears unused below — make_xor takes its own cross_entropy flag; confirm before removing
batch_size=128  # NOTE(review): also apparently unused — make_xor/train_model use their own defaults
def make_xor(output_dim, n_samples = 2000, noise = 0.2, cross_entropy = False, plot = True, batch_size = 128, filename = None):
    """Generate a noisy XOR classification dataset and wrap it in DataLoaders.

    Parameters
    ----------
    output_dim : int
        Output dimension of the downstream network. If 1, labels are
        reshaped to (N, 1) column vectors (for BCE-style losses);
        otherwise they are left 1-D.
    n_samples : int
        Total number of points before the 80/20 train/test split.
    noise : float
        Scale of the Gaussian jitter added to the binary corner points.
    cross_entropy : bool
        If True, labels are int64 (as nn.CrossEntropyLoss expects);
        otherwise float32 (for nn.BCELoss). (Fix: the old version built a
        long-dtype tensor but never used it — labels were always float32.)
    plot : bool
        If True, scatter-plot the training split.
    batch_size : int
        Mini-batch size of the training loader.
    filename : str or None
        If given, save the scatter plot to '<filename>.png'.

    Returns
    -------
    (train_dataloader, test_dataloader) : tuple of DataLoader
        The test loader serves the whole test split as a single batch.
    """
    # Fresh random seed per call; printed so a particular run can be reproduced.
    seed = np.random.randint(1000)
    print(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Corners of the unit square; the label is the XOR of the two coordinates.
    data = torch.randint(low=0, high=2, size=(n_samples, 2), dtype=torch.float32)
    labels = torch.logical_xor(data[:, 0] > 0, data[:, 1] > 0).float()
    # Jitter the corners and center the cloud around the origin.
    data += noise * torch.randn(data.shape) - 0.5
    # 80/20 train/test split via a random permutation. Working directly on
    # tensors also removes the torch.tensor(tensor) copy-construct warnings
    # the previous version emitted.
    perm = torch.randperm(n_samples)
    n_test = int(round(0.2 * n_samples))
    test_idx, train_idx = perm[:n_test], perm[n_test:]
    X_train, y_train = data[train_idx], labels[train_idx]
    X_test, y_test = data[test_idx], labels[test_idx]
    print(X_train[:5, :])
    print(y_train[:5])
    if plot:
        # Plot the training split, one color per XOR class.
        data_0 = X_train[y_train == 0]
        data_1 = X_train[y_train == 1]
        plt.figure(figsize=(8, 8))
        plt.scatter(data_0[:, 0], data_0[:, 1], s=20, c='C1', alpha = 0.5, label='Class 0')
        plt.scatter(data_1[:, 0], data_1[:, 1], s=20, c='C0', alpha = 0.5, label='Class 1')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.title('Training Dataset: XOR Clusters')
        plt.axis('equal')
        plt.grid(True)
        if filename is not None:
            # Fix: use the provided filename instead of a hard-coded name.
            plt.savefig(f'{filename}.png', bbox_inches='tight', dpi=300)
            print(f'Plot saved as {filename}.png')
        plt.show()
    # Label dtype depends on the intended loss function.
    label_dtype = torch.long if cross_entropy else torch.float32
    y_train = y_train.to(label_dtype)
    y_test = y_test.to(label_dtype)
    if output_dim == 1:
        # BCE-style losses expect (N, 1)-shaped targets.
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)
    # (Fix: the dataset construction was duplicated verbatim before.)
    train_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    # batch_size=n_samples > len(test set): a single batch covering everything.
    test_dataloader = DataLoader(TensorDataset(X_test, y_test), batch_size=n_samples, shuffle=False)
    return train_dataloader, test_dataloader
# output_dim=1: labels come back shaped (N, 1) to match nn.BCELoss targets.
train_loader, test_loader = make_xor(1)
133
tensor([[-0.6061, -0.7473],
[ 0.5979, -0.5322],
[ 0.5676, 0.7411],
[ 0.6931, -0.4058],
[ 0.4968, 0.8046]])
tensor([0., 1., 0., 1., 0.])
/var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:45: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). data_tensor = torch.tensor(data, dtype=torch.float32) /var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:51: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). labels_tensor = torch.tensor(labels, dtype=torch.float32) /var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:55: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). X_train = torch.tensor(X_train, dtype=torch.float32) /var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:56: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). X_test = torch.tensor(X_test, dtype=torch.float32) /var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:59: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). y_train = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32) /var/folders/lk/x8186w8j3_s9plr83dmmwnm00000gn/T/ipykernel_61726/926991177.py:60: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). 
y_test = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)
class FFnet(nn.Module):
    """Fully connected feed-forward binary classifier for 2-D inputs.

    Architecture: `depth` blocks of [Linear -> BatchNorm1d -> activation],
    then a width-1 Linear output layer followed by a sigmoid, so the
    forward pass returns probabilities in (0, 1).

    Parameters
    ----------
    depth : int
        Number of hidden [Linear, BatchNorm, activation] blocks.
        (Fix: the default previously read the module-level global `depth`
        at class-definition time; the literal 15 keeps the same value
        without depending on script state.)
    width : int
        Neurons per hidden layer.
    bottleneck : bool
        If True, hidden block depth-2 is squeezed to a single neuron —
        the bottleneck used in the embedding-restriction experiments.
    activation : str
        'tanh' selects nn.Tanh; any other value falls back to nn.ReLU.
    """

    def __init__(self, depth=15, width=2, bottleneck=False, activation='tanh'):
        super().__init__()
        act_fn = nn.Tanh if activation == 'tanh' else nn.ReLU
        # Filled by forward(collect_activations=True); one numpy array per
        # activation layer.
        self.activations = []
        layers = []
        in_features = 2
        for i in range(depth):
            # The bottleneck squeezes the second-to-last hidden block to width 1.
            out_features = 1 if bottleneck and i == depth - 2 else width
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.BatchNorm1d(out_features))
            layers.append(act_fn())
            in_features = out_features
        layers.append(nn.Linear(in_features, 1))  # output layer
        self.net = nn.Sequential(*layers)

    def forward(self, x, collect_activations=False):
        """Run the network on `x`; optionally record post-activation outputs.

        x: tensor of shape (batch, 2) or (2,) — a 1-D input gets a batch
        dimension prepended.
        collect_activations: if True, store each activation layer's output
        (detached, on CPU, as numpy) in self.activations.
        Returns a (batch, 1) tensor of sigmoid probabilities.
        """
        if x.dim() == 1:
            x = x.unsqueeze(0)  # [features] -> [1, features]
        self.activations = []
        for layer in self.net[:-1]:
            x = layer(x)
            if collect_activations and isinstance(layer, (nn.Tanh, nn.ReLU)):
                self.activations.append(x.detach().cpu().numpy())
        x = self.net[-1](x)
        return torch.sigmoid(x)
def compute_accuracy(y_pred, y_true):
    """Fraction of correct binary predictions.

    y_pred: sigmoid outputs in [0, 1]; thresholded at 0.5 (>= 0.5 -> class 1).
    y_true: ground-truth labels (0 or 1), any numeric dtype.
    """
    predicted = (y_pred >= 0.5).int()
    matches = (predicted == y_true.int()).sum().item()
    return matches / y_true.shape[0]
import copy
def train_model(model, train_loader, test_loader,
                epochs=300, lr=0.01, patience=300, batch_size=128):
    """Train `model` with Adam + BCELoss, early-stopping on test accuracy.

    Parameters
    ----------
    model : nn.Module
        Must output sigmoid probabilities matching BCELoss targets.
    train_loader, test_loader : DataLoader
        Batches of (inputs, float labels).
    epochs : int
        Maximum number of epochs.
    lr : float
        Adam learning rate.
    patience : int
        Stop after this many epochs without a new best test accuracy.
    batch_size : int
        Unused — batching is fixed by the loaders; kept only for
        backward compatibility of the signature.

    Returns
    -------
    (model, best_acc, losses)
        The model loaded with its best-accuracy weights, the best test
        accuracy observed, and the per-epoch mean training losses.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    best_acc = 0
    # Fix: snapshot the initial weights so `best_model_state` is always
    # defined — previously a NameError if accuracy never exceeded 0.
    best_model_state = copy.deepcopy(model.state_dict())
    patience_counter = 0
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            y_pred = model(batch_X)
            loss = criterion(y_pred, batch_y)
            epoch_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        losses.append(epoch_loss / len(train_loader))
        # Evaluate mean accuracy over the test batches after every epoch.
        model.eval()
        with torch.no_grad():
            acc_summed = 0.
            counter = 0
            for X_test, y_test in test_loader:
                counter += 1
                test_preds = model(X_test)
                acc_summed += compute_accuracy(test_preds, y_test)
            acc = acc_summed / counter
        model.train()
        if acc > best_acc:
            best_acc = acc
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"⏹️ Early stopping at epoch {epoch}, best acc: {best_acc:.3f}")
                break
    # Restore the best snapshot unless the final epoch was itself the best.
    if patience_counter > 0:
        model.load_state_dict(best_model_state)
    return model, best_acc, losses
def train_until_threshold(model_class, train_loader, test_loader,
                          max_retries=10, threshold=0.95, **model_kwargs):
    """Re-initialize and train fresh models until test accuracy hits `threshold`.

    Returns the first (model, accuracy, losses) triple that reaches the
    threshold, or the last attempt's triple if `max_retries` is exhausted.
    """
    for attempt in range(1, max_retries + 1):
        candidate = model_class(**model_kwargs)
        model, acc, losses = train_model(candidate, train_loader, test_loader)
        print(f"[Attempt {attempt}] Accuracy: {acc:.3f}")
        if acc >= threshold:
            print(f"✅ Success after {attempt} attempt(s)!")
            return model, acc, losses
    print("❌ Failed to reach threshold.")
    return model, acc, losses
def plot_loss_curve(losses, title="Training Loss"):
    """Plot per-epoch training losses on a labelled, gridded figure."""
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(losses, label="Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Binary Cross Entropy Loss")
    ax.set_title(title)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()
from matplotlib.colors import to_rgb, LinearSegmentedColormap
# def plot_decision_boundary(model, X, y, title="Decision Boundary", margin = 0.2):
# colors = [to_rgb("C0"), [1, 1, 1], to_rgb("C1")] # first color is orange, last is blue
# cm = LinearSegmentedColormap.from_list(
# "Custom", colors, N=40)
# model.eval()
# x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
# y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
# xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
# np.linspace(y_min, y_max, 200))
# grid = np.c_[xx.ravel(), yy.ravel()]
# grid_tensor = torch.tensor(grid, dtype=torch.float32)
# with torch.no_grad():
# preds = model(grid_tensor).numpy().reshape(xx.shape)
# plt.figure(figsize=(10, 8))
# plt.contourf(xx, yy, preds, levels=50, cmap=cm, alpha=0.8)
# # plt.scatter(X[:, 0], X[:, 1], c=y.squeeze(), cmap='bwr', edgecolors='k')
# plt.colorbar(label='Prediction Probability')
# plt.xlabel('X')
# plt.ylabel('Y')
# plt.scatter(X[:, 0], X[:, 1], s=25, c = y.squeeze(), cmap = cm, edgecolors='black', linewidths=0.5, alpha=0.9)
# # plt.scatter(inside_points[:500, 0], inside_points[:500, 1], s=25, c='C0', edgecolors='black', linewidths=0.5, alpha=0.5, label='Inside Points')
# plt.title(title)
# # plt.axis('equal')
# plt.grid(False)
# plt.xlim(-1, 1)
# plt.ylim(-1, 1)
# plt.axis('tight')
# plt.show()
# Train both models.
# Baseline: 15 hidden blocks of constant width 2 — the architecture for
# which the embedding restrictions discussed above apply.
model_base, acc_base, losses_base = train_until_threshold(
    FFnet, train_loader, test_loader,
    max_retries=20, threshold=0.95,
    depth=15, width=2, bottleneck=False, activation='tanh'
)
# Bottleneck: same depth/width, but one hidden block squeezed to width 1.
model_bottleneck, acc_bottleneck, losses_bottleneck = train_until_threshold(
    FFnet, train_loader, test_loader,
    max_retries=10, threshold=0.95,
    depth=15, width=2, bottleneck=True, activation='tanh'
)
[Attempt 1] Accuracy: 0.990 ✅ Success after 1 attempt(s)! [Attempt 1] Accuracy: 0.993 ✅ Success after 1 attempt(s)!
plot_loss_curve(losses_base, "Loss - Baseline Model")
plot_loss_curve(losses_bottleneck, "Loss - Bottleneck Model")
# Plot decision boundaries.
# The test loader serves the whole test split in a single batch.
X_test, y_test = next(iter(test_loader))
plot_decision_boundary(model_base, X_test.numpy(), y_test.numpy(), title="Baseline Model")
plot_decision_boundary(model_bottleneck, X_test.numpy(), y_test.numpy(), title="Bottleneck Model")
# from mpl_toolkits.mplot3d import Axes3D # needed for 3D plotting
# from mpl_toolkits.mplot3d import axes3d
# from matplotlib import cm # for colormaps
# def plot_decision_surface_3d(model, X, y, title="3D Prediction Surface"):
# model.eval()
# x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
# y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
# xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
# np.linspace(y_min, y_max, 200))
# grid = np.c_[xx.ravel(), yy.ravel()]
# grid_tensor = torch.tensor(grid, dtype=torch.float32)
# with torch.no_grad():
# preds = model(grid_tensor).numpy().reshape(xx.shape)
# fig = plt.figure(figsize=(12, 9))
# ax = fig.add_subplot(111, projection='3d')
# surf = ax.plot_surface(xx, yy, preds, cmap=cm.coolwarm, edgecolor='none', alpha=0.9)
# # Plot original data points in 3D (z=prediction)
# with torch.no_grad():
# data_preds = model(torch.tensor(X, dtype=torch.float32)).numpy()
# ax.scatter(X[:, 0], X[:, 1], data_preds, c=y.squeeze(), cmap='bwr', edgecolor='k', s=30)
# ax.set_xlabel('X')
# ax.set_ylabel('Y')
# ax.set_zlabel('Prediction')
# ax.set_title(title)
# fig.colorbar(surf, ax=ax, shrink=0.5, aspect=10, label='Prediction Value')
# plt.tight_layout()
# plt.show()
# plot_decision_surface_3d(model_base, X_test.numpy(), y_test.numpy(), title="3D Decision Surface - Baseline Model")
# plot_decision_surface_3d(model_bottleneck, X_test.numpy(), y_test.numpy(), title="3D Decision Surface - Bottleneck Model")
The tunnels of blue class-prediction values through the orange outer area are evidence of the embedding restrictions. With more layers the tunnel may become narrower and narrower (further improving accuracy), but due to the embedding restrictions it cannot disappear.
Open question: do bottlenecks improve training, and does high accuracy correlate with the singular values (SVs) in some way?
def plot_singular_values_of_weightmatrix(model, log_scale = True, title=''):
    """Scatter-plot the singular values of every Linear layer's weight matrix.

    For each nn.Linear in `model`, computes singular values via torch.svd
    and plots them against the layer index, making near-singular
    (rank-deficient) layers visible. X-ticks align with integer layer
    indices.

    Parameters
    ----------
    model : nn.Module
        Model whose Linear layers are inspected.
    log_scale : bool
        If True, use a logarithmic y-axis.
    title : str
        Suffix appended to the plot title.
    """
    linear_layers = [module for module in model.modules() if isinstance(module, nn.Linear)]
    all_singular_values = []
    max_sv = float('-inf')
    for i, layer in enumerate(linear_layers):
        W = layer.weight.detach().cpu()
        try:
            _, S, _ = torch.svd(W)
            sv_numpy = S.numpy()
            all_singular_values.append((i, sv_numpy))
            max_sv = max(max_sv, sv_numpy.max())
        except RuntimeError:
            # SVD can fail to converge on ill-conditioned matrices.
            print(f"⚠️ torch.svd failed on Layer {i+1} — possibly due to singularity.")
            all_singular_values.append((i, []))
    # One scatter point per singular value, grouped by layer index.
    plt.figure()
    for layer_idx, svals in all_singular_values:
        for sv in svals:
            plt.scatter(layer_idx, sv, color='blue', alpha=0.4)
    plt.title("SVs: " + title)
    plt.xlabel("Layer Index")
    # Fix: only advertise a log scale when one is actually used
    # (this function is also called with log_scale=False).
    plt.ylabel("Singular Value (log scale)" if log_scale else "Singular Value")
    if log_scale:
        plt.yscale("log")
    # Fix: guard against every SVD having failed (max_sv would be -inf).
    if max_sv > float('-inf'):
        plt.ylim([1e-4, max_sv * 1.2])
    # Integer x-ticks only, one per layer index.
    plt.xticks(list(range(len(linear_layers))))
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()
# Plot activation heatmaps
# NOTE(review): despite the comment above, these calls plot weight-matrix
# singular values, not activations — confirm which was intended.
plot_singular_values_of_weightmatrix(model_base, title = 'Baseline model')
plot_singular_values_of_weightmatrix(model_base, log_scale=False, title = 'Baseline model')
plot_singular_values_of_weightmatrix(model_bottleneck, title = 'Bottleneck model')
Models with increased width have no embedding restrictions, and we can observe that a singular value close to zero is constructed almost immediately.
def plot_weight_heatmaps(model, title=''):
    """Show absolute-value heatmaps of every Linear layer's weights, side by side."""
    linear_layers = [module for module in model.modules() if isinstance(module, nn.Linear)]
    n_layers = len(linear_layers)
    if n_layers == 0:
        print("No Linear layers with parameters found.")
        return
    fig, axes = plt.subplots(1, n_layers, figsize=(3 * n_layers, 3), squeeze=False)
    for idx, (ax, layer) in enumerate(zip(axes[0], linear_layers)):
        # Color encodes |weight|, clipped to [0, 5] for comparability across layers.
        magnitudes = abs(layer.weight.detach().cpu().numpy())
        image = ax.imshow(magnitudes, cmap='viridis', vmin=0, vmax=5, aspect='equal')
        ax.set_title(f"Layer {idx}")
        ax.set_xlabel("Out")
        ax.set_ylabel("In")
        plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    plt.suptitle("Weight Matrices Heatmaps - " + title)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Wide model: only 2 hidden blocks but width 3 — wider than the input, so
# the width-based embedding restriction does not apply.
model_wide, acc_wide, losses_wide = train_until_threshold(
    FFnet, train_loader, test_loader,
    max_retries=20, threshold=0.95,
    depth=2, width=3, bottleneck=False, activation='tanh'
)
[Attempt 1] Accuracy: 0.995 ✅ Success after 1 attempt(s)!
# Inspect the wide model: decision boundary and per-layer weight magnitudes.
plot_decision_boundary(model_wide, X_test.numpy(), y_test.numpy(), title="Wide Model")
plot_weight_heatmaps(model_wide, title="Wide Model")
The heatmap of layer 1 shows that the inner shape is constructed by weighting the 3 neurons of layer 0 almost equally, creating a smoothed-out triangle shape.
# Define a grid over the input space (used by the Jacobian evaluation loop below).
grid_size = 100 # Adjust as needed.
def psi_manual(x, func):
    """Singular values of the Jacobian of `func` at the point `x`.

    Parameters
    ----------
    x : torch.Tensor
        A 1-D tensor, e.g. shape (2,) for a point in R^2.
    func : callable
        A differentiable mapping built from torch operations.

    Returns
    -------
    numpy.ndarray
        ALL singular values of the Jacobian d func / d x at `x`, in
        descending order (torch.svd convention). (Fix: the old docstring
        claimed only the smallest value was returned, and documented a
        `model` parameter that does not exist.)
    """
    # Make x a leaf tensor with gradient tracking enabled.
    x = x.clone().detach().requires_grad_(True)
    # create_graph is unnecessary: only the values are needed, and the
    # result is detached below — the default (False) avoids building a
    # higher-order graph.
    jacobian = torch.autograd.functional.jacobian(func, x)
    # torch.svd(..., compute_uv=False) returns (U, S, V) with empty U and V;
    # index [1] selects the singular values.
    singular_values = torch.svd(jacobian, compute_uv=False)[1]
    return singular_values.detach().numpy()
x_range = np.linspace(-2, 2, grid_size)
y_range = np.linspace(-2, 2, grid_size)
# Two slots per grid point for the two singular values of the 2x2 Jacobian;
# torch.svd returns them in descending order.
psi_values = np.zeros((grid_size, grid_size, 2))
# Put the model in evaluation mode.
model_wide.eval()
func = lambda inp: model_wide(inp)
# Evaluate psi(x) over the grid (row index j follows y, column index i follows x).
for i, xv in enumerate(x_range):
    for j, yv in enumerate(y_range):
        # Create a 2D point as a torch tensor.
        x_point = torch.tensor([xv, yv], dtype=torch.float32)
        psi_values[j, i,:] = psi_manual(x_point, func) #one subtlety here: if there is only one SV it gets broadcast to all dimensions of psi_values[j,i,:] in the last dimension. this reduces if statements for e.g. the last layer, but we need to notice that the SINGLE SV gets plotted twice
def compute_singular_values_grid(model, grid_size=200):
    """Evaluate the Jacobian singular values of `model` over a [-1, 1]^2 grid.

    Returns (x_range, y_range, psi_values) where psi_values[j, i, :] holds
    the singular values of the model's Jacobian at (x_range[i], y_range[j]).
    """
    x_range = np.linspace(-1, 1, grid_size)
    y_range = np.linspace(-1, 1, grid_size)
    psi_values = np.zeros((grid_size, grid_size, 2))
    model.eval()
    func = lambda inp: model(inp)
    for col, xv in enumerate(x_range):
        for row, yv in enumerate(y_range):
            point = torch.tensor([xv, yv], dtype=torch.float32)
            sv = psi_manual(point, func)
            # A lone singular value (e.g. from a 1-D output) is written into
            # both slots so downstream plotting needs no special-casing.
            psi_values[row, col, :] = sv[0] if sv.shape[0] == 1 else sv
    return x_range, y_range, psi_values
from matplotlib import colors
def plot_singular_values(x_range, y_range, psi_values, title="Singular Values"):
    """Contour-plot the min and max Jacobian singular values over the grid.

    psi_values has shape (grid, grid, 2) as produced by
    compute_singular_values_grid. Fix: torch.svd returns singular values
    in *descending* order, so channel 0 holds the LARGEST value and
    channel 1 the smallest — the previous version had the two panels
    swapped.
    """
    max_sv = psi_values[:, :, 0]  # largest singular value (descending order)
    min_sv = psi_values[:, :, 1]  # smallest singular value
    plt.figure(figsize=(10, 4))
    # Geometric level spacing (epsilon avoids log(0)) so structure near 0 stays visible.
    epsilon = 1e-4
    num_levels = 50
    min_sv_max = np.max(min_sv)
    max_sv_max = np.max(max_sv)
    min_levels = np.geomspace(epsilon, min_sv_max + epsilon, num_levels) - epsilon
    max_levels = np.geomspace(epsilon, max_sv_max + epsilon, num_levels) - epsilon
    # BoundaryNorm keeps the colormap aligned with the nonlinear levels.
    min_norm = colors.BoundaryNorm(min_levels, ncolors=256)
    max_norm = colors.BoundaryNorm(max_levels, ncolors=256)
    # Left panel: minimum singular value.
    plt.subplot(1, 2, 1)
    contour1 = plt.contourf(x_range, y_range, min_sv, levels=min_levels, cmap='viridis', norm=min_norm)
    plt.colorbar(contour1, label='Min Singular Value')
    plt.title(f'{title} (Min)')
    plt.xlabel('x')
    plt.ylabel('y')
    # Right panel: maximum singular value.
    plt.subplot(1, 2, 2)
    contour2 = plt.contourf(x_range, y_range, max_sv, levels=max_levels, cmap='viridis', norm=max_norm)
    plt.colorbar(contour2, label='Max Singular Value')
    plt.title(f'{title} (Max)')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.tight_layout()
    plt.show()
# Compare the Jacobian singular-value fields across the three architectures.
models = [
    (model_base, "Baseline Model"),
    (model_bottleneck, "Bottleneck Model"),
    (model_wide, "Wide Model")
]
for model, name in models:
    x_range, y_range, psi_vals = compute_singular_values_grid(model, grid_size=100)
    plot_singular_values(x_range, y_range, psi_vals, title=name)
The SV plots show the very different structure of the NNs with 2 neurons per layer and the one with 3 neurons.
# Snapshot the bottleneck model's decision boundary and weight singular
# values every `epoch_step` epochs during training.
model_test = FFnet(depth=15, width=2, bottleneck=True, activation='tanh')
epoch_step = 10
losses_combined = []
for plot in range(1, 10):
    _, _, losses_running = train_model(model_test, train_loader, test_loader,
                                       epochs=epoch_step, lr=0.01, patience=300, batch_size=128)
    plot_decision_boundary(model_test, X_test.numpy(), y_test.numpy(), title="Bottleneck Model at " + str(epoch_step*plot))
    # Fix: this title previously used 50*plot, a stale constant from an
    # earlier epoch_step value — use epoch_step*plot consistently.
    plot_singular_values_of_weightmatrix(model_test, title = 'Bottleneck model at ' + str(epoch_step*plot))
    losses_combined.extend(losses_running)
print(losses_combined)
# Fix: this is the bottleneck model, not the baseline — title corrected.
plot_loss_curve(losses_combined, "Loss - Bottleneck Model")
[0.6965424739397489, 0.5752353897461524, 0.4810584577230307, 0.4117594430079827, 0.3779756587285262, 0.3300498976157262, 0.3064667788835672, 0.259887444285246, 0.2780483995492642, 0.26515413362246293, 0.22498109592841223, 0.24632233266647047, 0.19795817423325318, 0.24185058933037978, 0.19006814119907525, 0.19988700460929137, 0.16987603558943823, 0.15209551786000913, 0.1539547351690439, 0.18359227765064973, 0.20736427089342704, 0.14966382602086434, 0.19281824162373176, 0.18407103877801162, 0.15142951619166595, 0.14850046772223252, 0.14585711606420004, 0.13917149717991167, 0.13726713279118904, 0.15172484803658265, 0.12581464992119715, 0.13073371751950338, 0.11319464402130017, 0.14484896338902986, 0.16721343220426485, 0.16327568315542662, 0.1058241931291727, 0.11956280183333617, 0.1176777367408459, 0.1153460918710782, 0.1548682442651345, 0.13254589959979057, 0.1348826541350438, 0.11760929570748256, 0.11126964023480049, 0.3665006063305415, 0.1134775229371511, 0.11239169709957562, 0.11531591673309986, 0.10562307387590408, 0.12336473954984775, 0.10926228016614914, 0.0983461425281488, 0.1242283508181572, 0.10321946470783307, 0.10291810792226058, 0.10998822490756328, 0.36429238204772657, 0.12433616645061053, 0.10149291902780533, 0.10769809123415214, 0.08973340718792035, 0.08852121864373867, 0.08973426314500663, 0.106862916969336, 0.0969295699435931, 0.34449393445482623, 0.12553968380850095, 0.08259885385632515, 0.08214135926503402, 0.11008119869690675, 0.09279595372768548, 0.08628207874985841, 0.09327286266936706, 0.08619415530791649, 0.10045990605766957, 0.08735257845658523, 0.10031783236907078, 0.10354544136386651, 0.07087039345732102, 0.08762823847623971, 0.09958666992875245, 0.08536100158324608, 0.08614343404769897, 0.09473313004351579, 0.07214745563956407, 0.07909086661843154, 0.08399546676530288, 0.08484123303340031, 0.09024054915286027]
# Same periodic snapshots for the baseline (no-bottleneck) tanh model.
model_test = FFnet(depth=15, width=2, bottleneck=False, activation='tanh')
epoch_step = 10
losses_combined = []
for plot in range(1,10):
    _, _, losses_running = train_model(model_test, train_loader, test_loader,
    epochs=epoch_step, lr=0.01, patience=300, batch_size=128)
    plot_decision_boundary(model_test, X_test.numpy(), y_test.numpy(), title="Base Model at " + str(epoch_step*plot))
    plot_singular_values_of_weightmatrix(model_test, log_scale=False, title = 'Base model at ' + str(epoch_step*plot))
    for loss in losses_running:
        losses_combined.append(loss)
# Repeat the experiment with ReLU activations instead of tanh
# (note the larger epoch_step of 50 here).
model_relu = FFnet(depth=15, width=2, bottleneck=False, activation='relu')
epoch_step = 50
losses_combined = []
for plot in range(1,10):
    _, _, losses_running = train_model(model_relu, train_loader, test_loader,
    epochs=epoch_step, lr=0.01, patience=300, batch_size=128)
    plot_decision_boundary(model_relu, X_test.numpy(), y_test.numpy(), title="Base Model at " + str(epoch_step*plot))
    plot_singular_values_of_weightmatrix(model_relu, log_scale=False, title = 'Base model at ' + str(epoch_step*plot))
    for loss in losses_running:
        losses_combined.append(loss)